import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn import model_selection
from sklearn.metrics import precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold, LeavePOut
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, LeaveOneOut, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTENC, SMOTE, ADASYN
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from scipy.stats import zscore
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
A. Import ‘signal-data.csv’ as DataFrame. [2 Marks]
df_signal_data = pd.read_csv('signal-data.csv')
df_signal_data.shape
(1567, 592)
df_signal_data.columns
Index(['Time', '0', '1', '2', '3', '4', '5', '6', '7', '8',
...
'581', '582', '583', '584', '585', '586', '587', '588', '589',
'Pass/Fail'],
dtype='object', length=592)
df_signal_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1567 entries, 0 to 1566 Columns: 592 entries, Time to Pass/Fail dtypes: float64(590), int64(1), object(1) memory usage: 7.1+ MB
df_signal_data.count()
Time 1567
0 1561
1 1560
2 1553
3 1553
...
586 1566
587 1566
588 1566
589 1566
Pass/Fail 1567
Length: 592, dtype: int64
df_signal_data.describe()
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1561.000000 | 1560.000000 | 1553.000000 | 1553.000000 | 1553.000000 | 1553.0 | 1553.000000 | 1558.000000 | 1565.000000 | 1565.000000 | ... | 618.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1566.000000 | 1567.000000 |
| mean | 3014.452896 | 2495.850231 | 2200.547318 | 1396.376627 | 4.197013 | 100.0 | 101.112908 | 0.121822 | 1.462862 | -0.000841 | ... | 97.934373 | 0.500096 | 0.015318 | 0.003847 | 3.067826 | 0.021458 | 0.016475 | 0.005283 | 99.670066 | -0.867262 |
| std | 73.621787 | 80.407705 | 29.513152 | 441.691640 | 56.355540 | 0.0 | 6.237214 | 0.008961 | 0.073897 | 0.015116 | ... | 87.520966 | 0.003404 | 0.017180 | 0.003720 | 3.578033 | 0.012358 | 0.008808 | 0.002867 | 93.891919 | 0.498010 |
| min | 2743.240000 | 2158.750000 | 2060.660000 | 0.000000 | 0.681500 | 100.0 | 82.131100 | 0.000000 | 1.191000 | -0.053400 | ... | 0.000000 | 0.477800 | 0.006000 | 0.001700 | 1.197500 | -0.016900 | 0.003200 | 0.001000 | 0.000000 | -1.000000 |
| 25% | 2966.260000 | 2452.247500 | 2181.044400 | 1081.875800 | 1.017700 | 100.0 | 97.920000 | 0.121100 | 1.411200 | -0.010800 | ... | 46.184900 | 0.497900 | 0.011600 | 0.003100 | 2.306500 | 0.013425 | 0.010600 | 0.003300 | 44.368600 | -1.000000 |
| 50% | 3011.490000 | 2499.405000 | 2201.066700 | 1285.214400 | 1.316800 | 100.0 | 101.512200 | 0.122400 | 1.461600 | -0.001300 | ... | 72.288900 | 0.500200 | 0.013800 | 0.003600 | 2.757650 | 0.020500 | 0.014800 | 0.004600 | 71.900500 | -1.000000 |
| 75% | 3056.650000 | 2538.822500 | 2218.055500 | 1591.223500 | 1.525700 | 100.0 | 104.586700 | 0.123800 | 1.516900 | 0.008400 | ... | 116.539150 | 0.502375 | 0.016500 | 0.004100 | 3.295175 | 0.027600 | 0.020300 | 0.006400 | 114.749700 | -1.000000 |
| max | 3356.350000 | 2846.440000 | 2315.266700 | 3715.041700 | 1114.536600 | 100.0 | 129.252200 | 0.128600 | 1.656400 | 0.074900 | ... | 737.304800 | 0.509800 | 0.476600 | 0.104500 | 99.303200 | 0.102800 | 0.079900 | 0.028600 | 737.304800 | 1.000000 |
8 rows × 591 columns
df_signal_data.nunique()
Time 1534
0 1520
1 1504
2 507
3 518
...
586 322
587 260
588 120
589 611
Pass/Fail 2
Length: 592, dtype: int64
df_signal_data.dtypes
Time object
0 float64
1 float64
2 float64
3 float64
...
586 float64
587 float64
588 float64
589 float64
Pass/Fail int64
Length: 592, dtype: object
B. Print 5 point summary and share at least 2 observations. [3 Marks]
Descriptive Statistics involves understanding the distribution and nature of the data. Five number summary is a part of descriptive statistics and consists of five values and all these values will help us to describe the data.
df_signal_data.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1561.0 | 3014.452896 | 73.621787 | 2743.2400 | 2966.260000 | 3011.4900 | 3056.6500 | 3356.3500 |
| 1 | 1560.0 | 2495.850231 | 80.407705 | 2158.7500 | 2452.247500 | 2499.4050 | 2538.8225 | 2846.4400 |
| 2 | 1553.0 | 2200.547318 | 29.513152 | 2060.6600 | 2181.044400 | 2201.0667 | 2218.0555 | 2315.2667 |
| 3 | 1553.0 | 1396.376627 | 441.691640 | 0.0000 | 1081.875800 | 1285.2144 | 1591.2235 | 3715.0417 |
| 4 | 1553.0 | 4.197013 | 56.355540 | 0.6815 | 1.017700 | 1.3168 | 1.5257 | 1114.5366 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 586 | 1566.0 | 0.021458 | 0.012358 | -0.0169 | 0.013425 | 0.0205 | 0.0276 | 0.1028 |
| 587 | 1566.0 | 0.016475 | 0.008808 | 0.0032 | 0.010600 | 0.0148 | 0.0203 | 0.0799 |
| 588 | 1566.0 | 0.005283 | 0.002867 | 0.0010 | 0.003300 | 0.0046 | 0.0064 | 0.0286 |
| 589 | 1566.0 | 99.670066 | 93.891919 | 0.0000 | 44.368600 | 71.9005 | 114.7497 | 737.3048 |
| Pass/Fail | 1567.0 | -0.867262 | 0.498010 | -1.0000 | -1.000000 | -1.0000 | -1.0000 | 1.0000 |
591 rows × 8 columns
Observations:
- Data contains 591 columns and total of 1567 records
- The 0th column contains 1561 records, 2966 25th percentile (Q1), 3011 50th percentile (Q2), 3356 75th percentile (Q3), 2743 minimum and 3356 maximum, 3014 mean and 73 standard deviation.
A. Write a for loop which will remove all the features with 20%+ Null values and impute rest with mean of the feature. [5 Marks]
# Below code gives percentage of null in every column
null_percentage = df_signal_data.isnull().sum() / df_signal_data.shape[0] * 100
print(null_percentage)
Time 0.000000
0 0.382897
1 0.446713
2 0.893427
3 0.893427
...
586 0.063816
587 0.063816
588 0.063816
589 0.063816
Pass/Fail 0.000000
Length: 592, dtype: float64
# Below code gives list of columns having more than 20% null
columns_to_drop = null_percentage[null_percentage > 0.20].keys()
print(columns_to_drop)
Index(['0', '1', '2', '3', '4', '5', '6', '7', '19', '40',
...
'564', '565', '566', '567', '568', '569', '578', '579', '580', '581'],
dtype='object', length=338)
for column in df_signal_data.select_dtypes([np.number]).columns: # Loop through columns
null_percentage = df_signal_data[column].isnull().sum() / len(df_signal_data)
null_percentage = null_percentage * 100
if null_percentage > 0.20:
df_signal_data.drop([column], axis = 1, inplace = True) # drop this column if missing value percentage is more than 20%
else:
mean = df_signal_data[column].mean()
df_signal_data[column] = df_signal_data[column].fillna(mean) # impute missing values with mean of the column if missing value percentage is less than 20%
df_signal_data.columns
Index(['Time', '8', '9', '10', '11', '12', '13', '14', '15', '16',
...
'577', '582', '583', '584', '585', '586', '587', '588', '589',
'Pass/Fail'],
dtype='object', length=254)
# Below code gives percentage of null in every column
null_percentage = df_signal_data.isnull().sum() / df_signal_data.shape[0] * 100
print(null_percentage)
Time 0.0
8 0.0
9 0.0
10 0.0
11 0.0
...
586 0.0
587 0.0
588 0.0
589 0.0
Pass/Fail 0.0
Length: 254, dtype: float64
B. Identify and drop the features which are having same value for all the rows. [3 Marks]
nunique = df_signal_data.nunique()
cols_to_drop = nunique[nunique == 1].index
print('Columns which are unique and to drop: ', cols_to_drop)
Columns which are unique and to drop: Index(['13', '42', '49', '52', '149', '179', '186', '189', '284', '315', '322',
'325', '422', '451', '458', '461'],
dtype='object')
df_signal_data.drop(cols_to_drop, axis = 1, inplace = True)
nunique = df_signal_data.nunique()
cols_to_drop = nunique[nunique == 1].index
print('Columns which are unique and to drop: ', cols_to_drop)
Columns which are unique and to drop: Index([], dtype='object')
C. Drop other features if required using relevant functional knowledge. Clearly justify the same. [2 Marks]
E. Make all relevant modifications on the data using both functional/logical reasoning/assumptions. [2 Marks]
df_signal_data.isnull().any().any()
False
# Checking for distribution of the target class shows that the data set is highly imbalanced
df_signal_data['Pass/Fail'].value_counts(normalize = True)
-1 0.933631 1 0.066369 Name: Pass/Fail, dtype: float64
# label encoding the target class
# df_signal_data['Pass/Fail'] = df_signal_data['Pass/Fail'].replace([-1, 1], [0, 1])
# Making a copy of the dataset and dropping the target class
df_signal_data_copy = df_signal_data.copy()
df_signal_data_copy.drop(['Pass/Fail'], axis = 1)
| Time | 8 | 9 | 10 | 11 | 12 | 14 | 15 | 16 | 17 | ... | 576 | 577 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008-07-19 11:55:00 | 1.500500 | 0.016200 | -0.003400 | 0.945500 | 202.439600 | 7.955800 | 414.871000 | 10.043300 | 0.968000 | ... | 1.6765 | 14.9509 | 0.5005 | 0.0118 | 0.0035 | 2.3630 | 0.021458 | 0.016475 | 0.005283 | 99.670066 |
| 1 | 2008-07-19 12:32:00 | 1.496600 | -0.000500 | -0.014800 | 0.962700 | 200.547000 | 10.154800 | 414.734700 | 9.259900 | 0.970100 | ... | 1.1065 | 10.9003 | 0.5019 | 0.0223 | 0.0055 | 4.4447 | 0.009600 | 0.020100 | 0.006000 | 208.204500 |
| 2 | 2008-07-19 13:17:00 | 1.443600 | 0.004100 | 0.001300 | 0.961500 | 202.017900 | 9.515700 | 416.707500 | 9.314400 | 0.967400 | ... | 2.0952 | 9.2721 | 0.4958 | 0.0157 | 0.0039 | 3.1745 | 0.058400 | 0.048400 | 0.014800 | 82.860200 |
| 3 | 2008-07-19 14:43:00 | 1.488200 | -0.012400 | -0.003300 | 0.962900 | 201.848200 | 9.605200 | 422.289400 | 9.692400 | 0.968700 | ... | 1.7585 | 8.5831 | 0.4990 | 0.0103 | 0.0025 | 2.0544 | 0.020200 | 0.014900 | 0.004400 | 73.843200 |
| 4 | 2008-07-19 15:22:00 | 1.503100 | -0.003100 | -0.007200 | 0.956900 | 201.942400 | 10.566100 | 420.592500 | 10.338700 | 0.973500 | ... | 1.6597 | 10.9698 | 0.4800 | 0.4766 | 0.1045 | 99.3032 | 0.020200 | 0.014900 | 0.004400 | 73.843200 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1562 | 2008-10-16 15:13:00 | 1.342400 | -0.004500 | -0.005700 | 0.957900 | 203.986700 | 11.769200 | 419.340400 | 10.239700 | 0.969300 | ... | 1.4879 | 11.7256 | 0.4988 | 0.0143 | 0.0039 | 2.8669 | 0.006800 | 0.013800 | 0.004700 | 203.172000 |
| 1563 | 2008-10-16 20:49:00 | 1.433300 | -0.006100 | -0.009300 | 0.961800 | 204.017300 | 9.162000 | 405.817800 | 10.228500 | 0.969600 | ... | 1.0187 | 17.8379 | 0.4975 | 0.0131 | 0.0036 | 2.6238 | 0.006800 | 0.013800 | 0.004700 | 203.172000 |
| 1564 | 2008-10-17 05:26:00 | 1.462862 | -0.000841 | 0.000146 | 0.964353 | 199.956809 | 9.005371 | 413.086035 | 9.907603 | 0.971444 | ... | 1.2237 | 17.7267 | 0.4987 | 0.0153 | 0.0041 | 3.0590 | 0.019700 | 0.008600 | 0.002500 | 43.523100 |
| 1565 | 2008-10-17 06:01:00 | 1.462200 | -0.007200 | 0.003200 | 0.969400 | 197.244800 | 9.735400 | 401.915300 | 9.863000 | 0.974000 | ... | 1.7085 | 19.2104 | 0.5004 | 0.0178 | 0.0038 | 3.5662 | 0.026200 | 0.024500 | 0.007500 | 93.494100 |
| 1566 | 2008-10-17 06:07:00 | 1.462862 | -0.000841 | 0.000146 | 0.964353 | 199.956809 | 9.005371 | 413.086035 | 9.907603 | 0.971444 | ... | 1.2878 | 22.9183 | 0.4987 | 0.0181 | 0.0040 | 3.6275 | 0.011700 | 0.016200 | 0.004500 | 137.784400 |
1567 rows × 237 columns
# Create correlation matrix
corr_matrix = df_signal_data_copy.select_dtypes([np.number]).corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find features with correlation greater than 0.70
to_drop = [column for column in upper.columns if any(upper[column] > 0.70)]
# Drop features
df_signal_data_copy.drop(to_drop, axis = 1, inplace=True)
C:\Users\Bhavya Govindrao\AppData\Local\Temp\ipykernel_10760\4237487680.py:5: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
row, column = df_signal_data_copy.shape
print('After dropping the correlated variables the dataset contains', row, 'rows and', column, 'columns')
After dropping the correlated variables the dataset contains 1567 rows and 107 columns
# Boxplot to check for outliers
plt.figure(figsize = (50, 50))
col = 1
for i in df_signal_data_copy.select_dtypes([np.number]).columns:
plt.subplot(20, 10, col)
sns.boxplot(df_signal_data_copy[i], color = 'blue')
col += 1
# Find the outliers and replace them by median
for i in df_signal_data_copy.select_dtypes([np.number]).columns:
q1 = df_signal_data_copy[i].quantile(0.25)
q3 = df_signal_data_copy[i].quantile(0.75)
iqr = q3 - q1
low = q1 - 1.5 * iqr
high = q3 + 1.5 * iqr
df_signal_data_copy.loc[(df_signal_data_copy[i] < low) | (df_signal_data_copy[i] > high), i] = df_signal_data_copy[i].median()
# After treating the outlier values
plt.figure(figsize = (50, 50))
col = 1
for i in df_signal_data_copy.select_dtypes([np.number]).columns:
plt.subplot(20, 10, col)
sns.boxplot(df_signal_data_copy[i], color = 'blue')
col += 1
# Plotting histogram to check for the frequency of values within a variable
df_signal_data_copy.hist(bins = 30, figsize = (40, 40), color = 'blue')
plt.show()
# Density plot to check for the distribution of the variables
plt.figure(figsize = (40, 40))
col = 1
for i in df_signal_data_copy.select_dtypes([np.number]).columns:
plt.subplot(20, 10, col)
sns.histplot(df_signal_data_copy[i], color = 'blue')
col += 1
D. Check for multi-collinearity in the data and take necessary action. [3 Marks]
# Calculate the correlation matrix
def calculate_correlation_matrix(df_signal_data, threshold):
plt.figure(figsize = (20, 18))
corr_matrix = df_signal_data.select_dtypes([np.number]).corr()
sns.heatmap(abs(corr_matrix > 0.7), cmap = "Greens");
# Remove the highly collinear features from dataframe
def remove_collinear_features(df_signal_data, threshold):
# Calculate the correlation matrix
corr_matrix = df_signal_data.select_dtypes([np.number]).corr()
iterations = range(len(corr_matrix.columns) - 1)
drop_columns = []
# Iterate through the correlation matrix and compare correlations
for i in iterations:
for j in range(i+1):
item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
col = item.columns
row = item.index
val = abs(item.values)
# If correlation exceeds the threshold
if val >= threshold:
# Print the correlated features and the correlation value
print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
drop_columns.append(col.values[0])
# Drop one of each pair of correlated columns
drops = set(drop_cols)
df_signal_data = df_signal_data.drop(columns = drops, axis = 1, inplace = True)
return df_signal_data
threshold = 0.70
calculate_correlation_matrix(df_signal_data, threshold)
# Remove columns having more than 70% correlation
# Both positive and negative correlations are considered here
threshold = 0.70
data = remove_collinear_features(df_signal_data, threshold)
17 | 11 | 0.79 22 | 21 | 0.73 26 | 25 | 0.82 27 | 25 | 0.98 27 | 26 | 0.79 30 | 29 | 0.86 34 | 32 | 0.75 35 | 34 | 0.77 36 | 32 | 0.75 36 | 34 | 1.0 36 | 35 | 0.77 39 | 34 | 0.8 39 | 36 | 0.8 46 | 45 | 0.81 50 | 46 | 0.9 51 | 47 | 0.71 147 | 16 | 0.89 148 | 16 | 0.97 148 | 147 | 0.89 152 | 16 | 0.98 152 | 147 | 0.9 152 | 148 | 0.99 154 | 16 | 0.87 154 | 147 | 0.8 154 | 148 | 0.94 154 | 152 | 0.89 163 | 26 | 0.71 163 | 159 | 0.76 164 | 26 | 0.77 164 | 159 | 0.8 164 | 163 | 0.92 165 | 26 | 0.74 165 | 159 | 0.79 165 | 163 | 0.9 165 | 164 | 0.96 174 | 172 | 1.0 185 | 184 | 0.71 187 | 185 | 0.83 248 | 114 | 0.7 249 | 114 | 0.98 249 | 248 | 0.73 252 | 117 | 0.99 254 | 119 | 0.8 279 | 144 | 0.98 280 | 145 | 0.96 281 | 146 | 0.95 282 | 16 | 0.88 282 | 147 | 1.0 282 | 148 | 0.89 282 | 152 | 0.89 282 | 154 | 0.8 283 | 16 | 0.97 283 | 147 | 0.89 283 | 148 | 1.0 283 | 152 | 0.99 283 | 154 | 0.94 283 | 282 | 0.89 285 | 150 | 0.97 286 | 151 | 0.99 287 | 16 | 0.98 287 | 147 | 0.9 287 | 148 | 0.99 287 | 152 | 1.0 287 | 154 | 0.89 287 | 282 | 0.89 287 | 283 | 0.99 288 | 153 | 1.0 289 | 16 | 0.88 289 | 147 | 0.81 289 | 148 | 0.94 289 | 152 | 0.89 289 | 154 | 0.99 289 | 282 | 0.81 289 | 283 | 0.94 289 | 287 | 0.89 291 | 156 | 0.99 294 | 159 | 0.99 294 | 163 | 0.79 294 | 164 | 0.83 294 | 165 | 0.82 295 | 160 | 1.0 296 | 161 | 0.99 297 | 162 | 0.99 298 | 26 | 0.73 298 | 159 | 0.77 298 | 163 | 0.99 298 | 164 | 0.94 298 | 165 | 0.92 298 | 294 | 0.81 299 | 26 | 0.77 299 | 159 | 0.8 299 | 163 | 0.92 299 | 164 | 1.0 299 | 165 | 0.96 299 | 294 | 0.83 299 | 298 | 0.95 300 | 26 | 0.75 300 | 159 | 0.79 300 | 163 | 0.9 300 | 164 | 0.97 300 | 165 | 1.0 300 | 294 | 0.82 300 | 298 | 0.93 300 | 299 | 0.97 301 | 166 | 0.96 302 | 167 | 0.98 303 | 168 | 0.96 304 | 27 | 0.71 304 | 169 | 0.98 305 | 170 | 0.96 306 | 171 | 0.99 307 | 172 | 0.96 307 | 174 | 0.96 308 | 173 | 0.96 309 | 172 | 0.96 309 | 174 | 0.96 309 | 307 | 1.0 310 | 175 | 0.96 311 | 176 | 0.98 312 | 177 | 1.0 316 | 180 | 0.88 317 | 181 | 0.96 318 | 182 | 0.98 319 | 183 | 0.98 320 | 184 | 0.99 320 | 185 | 0.72 321 | 184 | 0.71 321 | 185 | 0.99 321 | 187 | 0.83 321 | 320 | 0.72 323 | 185 | 0.82 323 | 187 | 0.99 323 | 321 | 0.82 324 | 188 | 0.98 356 | 218 | 0.95 359 | 221 | 0.98 360 | 222 | 0.99 361 | 223 | 0.98 365 | 227 | 0.97 366 | 228 | 0.97 376 | 238 | 0.97 377 | 239 | 0.95 386 | 248 | 1.0 386 | 249 | 0.73 387 | 114 | 0.98 387 | 248 | 0.73 387 | 249 | 1.0 387 | 386 | 0.73 388 | 250 | 0.97 389 | 251 | 1.0 390 | 117 | 0.99 390 | 252 | 1.0 392 | 119 | 0.79 392 | 254 | 0.99 393 | 255 | 0.99 417 | 144 | 0.99 417 | 279 | 0.97 420 | 16 | 0.9 420 | 147 | 1.0 420 | 148 | 0.9 420 | 152 | 0.91 420 | 154 | 0.81 420 | 282 | 1.0 420 | 283 | 0.9 420 | 287 | 0.91 420 | 289 | 0.82 421 | 16 | 0.96 421 | 147 | 0.89 421 | 148 | 1.0 421 | 152 | 0.98 421 | 154 | 0.95 421 | 282 | 0.88 421 | 283 | 1.0 421 | 287 | 0.98 421 | 289 | 0.95 421 | 420 | 0.9 424 | 151 | 0.98 424 | 286 | 0.97 425 | 16 | 0.94 425 | 147 | 0.87 425 | 148 | 0.96 425 | 152 | 0.98 425 | 154 | 0.86 425 | 282 | 0.87 425 | 283 | 0.96 425 | 287 | 0.97 425 | 289 | 0.86 425 | 420 | 0.88 425 | 421 | 0.95 426 | 153 | 1.0 426 | 288 | 0.99 427 | 16 | 0.89 427 | 147 | 0.82 427 | 148 | 0.95 427 | 152 | 0.91 427 | 154 | 1.0 427 | 282 | 0.82 427 | 283 | 0.95 427 | 287 | 0.91 427 | 289 | 0.99 427 | 420 | 0.83 427 | 421 | 0.97 427 | 425 | 0.88 429 | 156 | 1.0 429 | 291 | 0.99 430 | 26 | 0.74 430 | 159 | 0.87 430 | 163 | 0.83 430 | 164 | 0.88 430 | 165 | 0.85 430 | 294 | 0.89 430 | 298 | 0.84 430 | 299 | 0.87 430 | 300 | 0.85 431 | 26 | 0.74 431 | 160 | 0.81 431 | 163 | 0.81 431 | 164 | 0.85 431 | 165 | 0.81 431 | 294 | 0.72 431 | 295 | 0.83 431 | 298 | 0.83 431 | 299 | 0.85 431 | 300 | 0.82 431 | 430 | 0.9 434 | 26 | 0.82 434 | 159 | 0.71 434 | 163 | 0.88 434 | 164 | 0.9 434 | 165 | 0.86 434 | 294 | 0.75 434 | 298 | 0.89 434 | 299 | 0.89 434 | 300 | 0.86 434 | 430 | 0.95 434 | 431 | 0.93 435 | 26 | 0.83 435 | 159 | 0.71 435 | 163 | 0.84 435 | 164 | 0.91 435 | 165 | 0.87 435 | 294 | 0.75 435 | 298 | 0.86 435 | 299 | 0.9 435 | 300 | 0.86 435 | 430 | 0.95 435 | 431 | 0.93 435 | 434 | 0.99 436 | 26 | 0.81 436 | 159 | 0.71 436 | 163 | 0.84 436 | 164 | 0.9 436 | 165 | 0.88 436 | 294 | 0.75 436 | 298 | 0.86 436 | 299 | 0.9 436 | 300 | 0.87 436 | 430 | 0.95 436 | 431 | 0.93 436 | 434 | 0.99 436 | 435 | 1.0 437 | 166 | 0.99 437 | 301 | 0.95 439 | 168 | 0.79 439 | 303 | 0.77 440 | 27 | 0.71 440 | 169 | 1.0 440 | 304 | 0.98 441 | 170 | 0.99 441 | 305 | 0.95 442 | 171 | 0.97 442 | 306 | 0.96 443 | 172 | 1.0 443 | 174 | 1.0 443 | 307 | 0.96 443 | 309 | 0.96 444 | 173 | 0.99 444 | 308 | 0.95 445 | 172 | 1.0 445 | 174 | 1.0 445 | 307 | 0.96 445 | 309 | 0.96 445 | 443 | 0.99 446 | 175 | 1.0 446 | 310 | 0.95 447 | 176 | 1.0 447 | 311 | 0.98 448 | 177 | 1.0 448 | 312 | 1.0 452 | 180 | 0.99 452 | 316 | 0.86 453 | 181 | 1.0 453 | 317 | 0.96 454 | 182 | 0.99 454 | 318 | 0.97 455 | 183 | 1.0 455 | 319 | 0.98 456 | 184 | 0.97 456 | 185 | 0.71 456 | 320 | 0.96 456 | 321 | 0.72 457 | 185 | 1.0 457 | 187 | 0.81 457 | 320 | 0.7 457 | 321 | 0.99 457 | 323 | 0.8 457 | 456 | 0.71 459 | 185 | 0.82 459 | 187 | 1.0 459 | 321 | 0.82 459 | 323 | 0.99 459 | 457 | 0.81 490 | 218 | 0.98 490 | 356 | 0.93 493 | 221 | 1.0 493 | 359 | 0.98 494 | 222 | 1.0 494 | 360 | 1.0 495 | 223 | 1.0 495 | 361 | 0.97 520 | 248 | 1.0 520 | 249 | 0.73 520 | 386 | 1.0 520 | 387 | 0.73 522 | 250 | 0.99 522 | 388 | 0.96 523 | 251 | 1.0 523 | 389 | 1.0 524 | 117 | 0.98 524 | 252 | 1.0 524 | 390 | 1.0 526 | 119 | 0.81 526 | 254 | 1.0 526 | 392 | 0.99 527 | 255 | 1.0 527 | 393 | 0.98 545 | 543 | 0.99 560 | 559 | 0.89 561 | 559 | 0.98 561 | 560 | 0.82 573 | 572 | 0.79 574 | 572 | 0.99 574 | 573 | 0.78 575 | 572 | 0.78 575 | 573 | 0.98 575 | 574 | 0.77 576 | 572 | 0.99 576 | 573 | 0.79 576 | 574 | 0.99 576 | 575 | 0.78 577 | 572 | 0.86 577 | 573 | 0.96 577 | 574 | 0.85 577 | 575 | 0.93 577 | 576 | 0.86 584 | 583 | 0.99 585 | 583 | 1.0 585 | 584 | 1.0 588 | 587 | 0.97
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[33], line 5 1 # Remove columns having more than 70% correlation 2 # Both positive and negative correlations are considered here 4 threshold = 0.70 ----> 5 data = remove_collinear_features(df_signal_data, threshold) Cell In[31], line 24, in remove_collinear_features(df_signal_data, threshold) 21 drop_columns.append(col.values[0]) 23 # Drop one of each pair of correlated columns ---> 24 drops = set(drop_cols) 25 df_signal_data = df_signal_data.drop(columns = drops, axis = 1, inplace = True) 27 return df_signal_data NameError: name 'drop_cols' is not defined
A. Perform a detailed univariate Analysis with appropriate detailed comments after each analysis. [2 Marks]
df_signal_data.skew()
C:\Users\Bhavya Govindrao\AppData\Local\Temp\ipykernel_10760\2830632690.py:1: FutureWarning: The default value of numeric_only in DataFrame.skew is deprecated. In a future version, it will default to False. In addition, specifying 'numeric_only=None' is deprecated. Select only valid columns or specify the value of numeric_only to silence this warning. df_signal_data.skew()
8 -0.073824
9 0.331433
10 0.057724
11 -10.221613
12 7.209324
...
586 1.438483
587 1.948028
588 2.030018
589 2.715340
Pass/Fail 3.487359
Length: 237, dtype: float64
# Create correlation matrix
corr_matrix = df_signal_data.select_dtypes([np.number]).corr().abs()
# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# Find features with correlation greater than 0.70
to_drop = [column for column in upper.columns if any(upper[column] > 0.70)]
# Drop features
df_signal_data.drop(to_drop, axis = 1, inplace=True)
C:\Users\Bhavya Govindrao\AppData\Local\Temp\ipykernel_10760\1222408100.py:5: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
row, column = df_signal_data.shape
print('After dropping the correlated variables the dataset contains', row, 'rows and', column, 'columns')
After dropping the correlated variables the dataset contains 1567 rows and 107 columns
# Boxplot to check for outliers
plt.figure(figsize = (50, 50))
col = 1
for i in df_signal_data.select_dtypes([np.number]).columns:
plt.subplot(20, 10, col)
sns.boxplot(df_signal_data[i], color = 'blue')
col += 1
# Plotting histogram to check for the frequency of values within a variable
df_signal_data.hist(bins = 30, figsize = (40, 40), color = 'blue')
plt.show()
# Density plot to check for the distribution of the variables
plt.figure(figsize = (40, 40))
col = 1
for i in df_signal_data.select_dtypes([np.number]).columns:
plt.subplot(20, 10, col)
sns.histplot(df_signal_data[i], color = 'blue')
col += 1
B. Perform bivariate and multivariate analysis with appropriate detailed comments after each analysis. [3 Marks]
plt.style.use('seaborn-deep')
plt.rcParams['figure.figsize'] = (5, 5)
plt.pie(df_signal_data['Pass/Fail'].value_counts(), labels = ['Pass', 'Fail'], colors = ['blue', 'green'], explode = [0, 0.1], autopct = "%.2f%%", shadow = True)
plt.axis('off')
plt.title('Target: Pass or Fail', fontsize = 20)
plt.legend()
plt.show()
df_signal_data['Pass/Fail'].value_counts().plot(kind="bar");
C:\Users\Bhavya Govindrao\AppData\Local\Temp\ipykernel_10760\1510467266.py:1: MatplotlibDeprecationWarning: The seaborn styles shipped by Matplotlib are deprecated since 3.6, as they no longer correspond to the styles shipped by seaborn. However, they will remain available as 'seaborn-v0_8-<style>'. Alternatively, directly use the seaborn API instead.
plt.style.use('seaborn-deep')
# heatmap to get correlation
plt.rcParams['figure.figsize'] = (18, 18)
sns.heatmap(df_signal_data.corr(), cmap = "YlGnBu")
plt.title('Correlation heatmap for the Data', fontsize = 20)
C:\Users\Bhavya Govindrao\AppData\Local\Temp\ipykernel_10760\1586070570.py:4: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning. sns.heatmap(df_signal_data.corr(), cmap = "YlGnBu")
Text(0.5, 1.0, 'Correlation heatmap for the Data')
A. Segregate predictors vs target attributes. [2 Marks]
# Deleting the first column
df_signal_data = df_signal_data.drop(columns='Time')
df_signal_data.head()
| 8 | 9 | 10 | 11 | 12 | 14 | 15 | 16 | 18 | 20 | ... | 559 | 570 | 571 | 572 | 582 | 583 | 586 | 587 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.5005 | 0.0162 | -0.0034 | 0.9455 | 202.4396 | 7.9558 | 414.8710 | 10.0433 | 192.3963 | 1.4026 | ... | 0.4385 | 533.8500 | 2.1113 | 8.95 | 0.5005 | 0.0118 | 0.021458 | 0.016475 | 99.670066 | -1 |
| 1 | 1.4966 | -0.0005 | -0.0148 | 0.9627 | 200.5470 | 10.1548 | 414.7347 | 9.2599 | 191.2872 | 1.3825 | ... | 0.1745 | 535.0164 | 2.4335 | 5.92 | 0.5019 | 0.0223 | 0.009600 | 0.020100 | 208.204500 | -1 |
| 2 | 1.4436 | 0.0041 | 0.0013 | 0.9615 | 202.0179 | 9.5157 | 416.7075 | 9.3144 | 192.7035 | 1.4123 | ... | 0.3718 | 535.0245 | 2.0293 | 11.21 | 0.4958 | 0.0157 | 0.058400 | 0.048400 | 82.860200 | 1 |
| 3 | 1.4882 | -0.0124 | -0.0033 | 0.9629 | 201.8482 | 9.6052 | 422.2894 | 9.6924 | 192.1557 | 1.4011 | ... | 0.7288 | 530.5682 | 2.0253 | 9.33 | 0.4990 | 0.0103 | 0.020200 | 0.014900 | 73.843200 | -1 |
| 4 | 1.5031 | -0.0031 | -0.0072 | 0.9569 | 201.9424 | 10.5661 | 420.5925 | 10.3387 | 191.6037 | 1.3888 | ... | 0.2156 | 532.0155 | 2.0275 | 8.83 | 0.4800 | 0.4766 | 0.020200 | 0.014900 | 73.843200 | -1 |
5 rows × 106 columns
# separating the dependent and independent data
x = df_signal_data.iloc[:, :106]
y = df_signal_data["Pass/Fail"]
# getting the shapes of new data sets x and y
print("shape of x:", x.shape)
print("shape of y:", y.shape)
shape of x: (1567, 106) shape of y: (1567,)
# splitting them into train test and split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 1)
# gettiing the shapes
print("shape of x_train: ", x_train.shape)
print("shape of x_test: ", x_test.shape)
print("shape of y_train: ", y_train.shape)
print("shape of y_test: ", y_test.shape)
shape of x_train: (1096, 106) shape of x_test: (471, 106) shape of y_train: (1096,) shape of y_test: (471,)
print("{0:0.2f}% data is in training set".format((len(x_train) / len(df_signal_data.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_test) / len(df_signal_data.index)) * 100))
69.94% data is in training set 30.06% data is in test set
print("Training Fail : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train)) * 100))
print("Training Pass : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train)) * 100))
print("")
print("Test Fail : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test)) * 100))
print("Test Pass : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test)) * 100))
print("")
Training Fail : 72 (6.57%) Training Pass : 0 (0.00%) Test Fail : 32 (6.79%) Test Pass : 0 (0.00%)
logisticRegression = LogisticRegression()
logisticRegression.fit(x_train, y_train)
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression()
predicted_labels = logisticRegression.predict(x_test)
print('Accuracy on Training data: ', logisticRegression.score(x_train, y_train))
print('Accuracy on Test data: ', logisticRegression.score(x_test, y_test))
Accuracy on Training data: 0.9324817518248175 Accuracy on Test data: 0.9299363057324841
confusion_matrix(y_test, predicted_labels)
array([[438, 1],
[ 32, 0]], dtype=int64)
# calculate accuracy measures and confusion matrix
cm = confusion_matrix(y_test, predicted_labels)
df_cm = pd.DataFrame(cm, index = [i for i in ["Pass","Fail"]], columns = [i for i in ["Pass","Fail"]])
plt.figure(figsize = (7, 5))
sns.heatmap(df_cm, annot = True, fmt = 'g')
plt.show()
classification_report(y_test, predicted_labels)
' precision recall f1-score support\n\n -1 0.93 1.00 0.96 439\n 1 0.00 0.00 0.00 32\n\n accuracy 0.93 471\n macro avg 0.47 0.50 0.48 471\nweighted avg 0.87 0.93 0.90 471\n'
B. Check for target balancing and fix it if found imbalanced. [3 Marks]
D. Check if the train and test data have similar statistical characteristics when compared with original data. [2 Marks]
# # Initializaing various classification algorithms with normal dataset and choosing the best model based on f1 score for tuning
# seed = 3
# # prepare models
# models = []
# models.append(('LR', LogisticRegression()))
# models.append(('KNN', KNeighborsClassifier(n_neighbors=13)))
# models.append(('GNB', GaussianNB()))
# #models.append(("SVM", SVC(kernel='linear', C=1, gamma=.6)))
# models.append(("DT", DecisionTreeClassifier()))
# models.append(("RF", RandomForestClassifier()))
# models.append(("AB", AdaBoostClassifier()))
# models.append(("GBT", GradientBoostingClassifier()))
# #models.append(("XGB", XGBClassifier(verbosity=0)))
# #models.append(("LightGBM",LGBMClassifier()))
# # evaluate each model in turn
# results = []
# names = []
# scoring = 'accuracy'
# for name, model in models:
# kfold = model_selection.KFold(n_splits = 5, random_state = seed, shuffle = True)
# cv_results = model_selection.cross_val_score(model, x, y, cv = kfold, scoring = scoring)
# results.append(cv_results)
# names.append(name)
# msg = "%s: %f (%f)" % (name, cv_results.mean() * 100, cv_results.std() * 100)
# print(msg)
# Implementing random under sampling
randomUnderSampler = RandomUnderSampler(sampling_strategy = 0.5)
x_under, y_under= randomUnderSampler.fit_resample(x_train, y_train)
print("Under Training Fail : {0} ({1:0.2f}%)".format(len(y_under[y_under[:] == 1]), (len(y_under[y_under[:] == 1])/len(y_under)) * 100))
print("under Training Pass : {0} ({1:0.2f}%)".format(len(y_under[y_under[:] == 0]), (len(y_under[y_under[:] == 0])/len(y_under)) * 100))
Under Training Fail : 72 (33.33%) under Training Pass : 0 (0.00%)
# Implementing SMOTE
smote = SMOTE(sampling_strategy = 0.5)
x_SMOTE, y_SMOTE = smote.fit_resample(x_train, y_train)
print("SMOTE Training Fail : {0} ({1:0.2f}%)".format(len(y_SMOTE[y_SMOTE[:] == 1]), (len(y_SMOTE[y_SMOTE[:] == 1])/len(y_SMOTE)) * 100))
print("SMOTE Training Pass : {0} ({1:0.2f}%)".format(len(y_SMOTE[y_SMOTE[:] == 0]), (len(y_SMOTE[y_SMOTE[:] == 0])/len(y_SMOTE)) * 100))
SMOTE Training Fail : 512 (33.33%) SMOTE Training Pass : 0 (0.00%)
# Implementing random over sampling
randomOverSampler = RandomOverSampler(sampling_strategy = 0.5)
x_over, y_over = randomOverSampler.fit_resample(x_train, y_train)
print("over Training Fail : {0} ({1:0.2f}%)".format(len(y_over[y_over[:] == 1]), (len(y_over[y_over[:] == 1])/len(y_over)) * 100))
print("over Training Pass : {0} ({1:0.2f}%)".format(len(y_over[y_over[:] == 0]), (len(y_over[y_over[:] == 0])/len(y_over)) * 100))
over Training Fail : 512 (33.33%) over Training Pass : 0 (0.00%)
# Implementing ADASYN sampling
adasyn = ADASYN(sampling_strategy=0.5)
x_adasyn, y_adasyn = adasyn.fit_resample(x_train, y_train)
print("ADASYN Training Fail : {0} ({1:0.2f}%)".format(len(y_adasyn[y_adasyn[:] == 1]), (len(y_adasyn[y_adasyn[:] == 1])/len(y_adasyn)) * 100))
print("ADASYN Training Pass : {0} ({1:0.2f}%)".format(len(y_adasyn[y_adasyn[:] == 0]), (len(y_adasyn[y_adasyn[:] == 0])/len(y_adasyn)) * 100))
ADASYN Training Fail : 527 (33.98%) ADASYN Training Pass : 0 (0.00%)
# # Initializaing various classification algorithms with normal dataset and choosing the best model based on f1 score for tuning
# seed = 3
# # prepare models
# models = []
# models.append(('LR', LogisticRegression()))
# models.append(('KNN', KNeighborsClassifier(n_neighbors=13)))
# models.append(('GNB', GaussianNB()))
# models.append(("SVM", SVC(kernel='linear', C=1, gamma=.6)))
# models.append(("DT", DecisionTreeClassifier()))
# models.append(("RF", RandomForestClassifier()))
# models.append(("AB", AdaBoostClassifier()))
# models.append(("GBT", GradientBoostingClassifier()))
# #models.append(("XGB", XGBClassifier(verbosity=0)))
# #models.append(("LightGBM",LGBMClassifier()))
# # evaluate each model in turn
# results = []
# names = []
# scoring = 'accuracy'
# for name, model in models:
# kfold = model_selection.KFold(n_splits = 5, random_state = seed, shuffle = True)
# cv_results = model_selection.cross_val_score(model, x, y, cv = kfold, scoring = scoring)
# results.append(cv_results)
# names.append(name)
# msg = "%s: %f (%f)" % (name, cv_results.mean() * 100, cv_results.std() * 100)
# print(msg)
C. Perform train-test split and standardise the data or vice versa if required. [3 Marks]
x.head()
| 8 | 9 | 10 | 11 | 12 | 14 | 15 | 16 | 18 | 20 | ... | 559 | 570 | 571 | 572 | 582 | 583 | 586 | 587 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.5005 | 0.0162 | -0.0034 | 0.9455 | 202.4396 | 7.9558 | 414.8710 | 10.0433 | 192.3963 | 1.4026 | ... | 0.4385 | 533.8500 | 2.1113 | 8.95 | 0.5005 | 0.0118 | 0.021458 | 0.016475 | 99.670066 | -1 |
| 1 | 1.4966 | -0.0005 | -0.0148 | 0.9627 | 200.5470 | 10.1548 | 414.7347 | 9.2599 | 191.2872 | 1.3825 | ... | 0.1745 | 535.0164 | 2.4335 | 5.92 | 0.5019 | 0.0223 | 0.009600 | 0.020100 | 208.204500 | -1 |
| 2 | 1.4436 | 0.0041 | 0.0013 | 0.9615 | 202.0179 | 9.5157 | 416.7075 | 9.3144 | 192.7035 | 1.4123 | ... | 0.3718 | 535.0245 | 2.0293 | 11.21 | 0.4958 | 0.0157 | 0.058400 | 0.048400 | 82.860200 | 1 |
| 3 | 1.4882 | -0.0124 | -0.0033 | 0.9629 | 201.8482 | 9.6052 | 422.2894 | 9.6924 | 192.1557 | 1.4011 | ... | 0.7288 | 530.5682 | 2.0253 | 9.33 | 0.4990 | 0.0103 | 0.020200 | 0.014900 | 73.843200 | -1 |
| 4 | 1.5031 | -0.0031 | -0.0072 | 0.9569 | 201.9424 | 10.5661 | 420.5925 | 10.3387 | 191.6037 | 1.3888 | ... | 0.2156 | 532.0155 | 2.0275 | 8.83 | 0.4800 | 0.4766 | 0.020200 | 0.014900 | 73.843200 | -1 |
5 rows × 106 columns
# Split x and y into training and test set in 70:30 ratio
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 10)
print("{0:0.2f}% data is in training set".format((len(x_train) / len(df_signal_data.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_test) / len(df_signal_data.index)) * 100))
69.94% data is in training set 30.06% data is in test set
# Standardize the data training data
scaler = preprocessing.StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
# Standardize the data test data
scaler = preprocessing.StandardScaler().fit(x_test)
x_test_scaled = scaler.transform(x_test)
model = SVC()
model.fit(x_train_scaled, y_train)
prediction = model.predict(x_test_scaled)
# Check the accuracy on the training data
print('Accuracy on Training data: ', model.score(x_train, y_train))
# Check the accuracy on the testing data
print('Accuracy on Testing data: ', model.score(x_test, y_test))
# Calculate the recall value
print('Recall value: ', metrics.recall_score(y_test, prediction, average = 'macro'))
# Calculate the precision value
print('Precision value: ', metrics.precision_score(y_test, prediction, average = 'macro'))
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\base.py:413: UserWarning: X has feature names, but SVC was fitted without feature names warnings.warn( C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\base.py:413: UserWarning: X has feature names, but SVC was fitted without feature names warnings.warn(
Accuracy on Training data: 0.9315693430656934 Accuracy on Testing data: 0.9384288747346072 Recall value: 0.9137931034482758 Precision value: 0.9944071588366891
print("Confusion Matrix:\n", metrics.confusion_matrix(prediction, y_test))
Confusion Matrix: [[442 5] [ 0 24]]
print("Classification Report:\n", metrics.classification_report(prediction, y_test))
Classification Report:
precision recall f1-score support
-1 1.00 0.99 0.99 447
1 0.83 1.00 0.91 24
accuracy 0.99 471
macro avg 0.91 0.99 0.95 471
weighted avg 0.99 0.99 0.99 471
A. Use any Supervised Learning technique to train a model. [2 Marks]
x.head()
| 8 | 9 | 10 | 11 | 12 | 14 | 15 | 16 | 18 | 20 | ... | 559 | 570 | 571 | 572 | 582 | 583 | 586 | 587 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.5005 | 0.0162 | -0.0034 | 0.9455 | 202.4396 | 7.9558 | 414.8710 | 10.0433 | 192.3963 | 1.4026 | ... | 0.4385 | 533.8500 | 2.1113 | 8.95 | 0.5005 | 0.0118 | 0.021458 | 0.016475 | 99.670066 | -1 |
| 1 | 1.4966 | -0.0005 | -0.0148 | 0.9627 | 200.5470 | 10.1548 | 414.7347 | 9.2599 | 191.2872 | 1.3825 | ... | 0.1745 | 535.0164 | 2.4335 | 5.92 | 0.5019 | 0.0223 | 0.009600 | 0.020100 | 208.204500 | -1 |
| 2 | 1.4436 | 0.0041 | 0.0013 | 0.9615 | 202.0179 | 9.5157 | 416.7075 | 9.3144 | 192.7035 | 1.4123 | ... | 0.3718 | 535.0245 | 2.0293 | 11.21 | 0.4958 | 0.0157 | 0.058400 | 0.048400 | 82.860200 | 1 |
| 3 | 1.4882 | -0.0124 | -0.0033 | 0.9629 | 201.8482 | 9.6052 | 422.2894 | 9.6924 | 192.1557 | 1.4011 | ... | 0.7288 | 530.5682 | 2.0253 | 9.33 | 0.4990 | 0.0103 | 0.020200 | 0.014900 | 73.843200 | -1 |
| 4 | 1.5031 | -0.0031 | -0.0072 | 0.9569 | 201.9424 | 10.5661 | 420.5925 | 10.3387 | 191.6037 | 1.3888 | ... | 0.2156 | 532.0155 | 2.0275 | 8.83 | 0.4800 | 0.4766 | 0.020200 | 0.014900 | 73.843200 | -1 |
5 rows × 106 columns
# convert the features into z scores as we do not know what units / scales were used and store them in new dataframe
# It is always adviced to scale numeric attributes in models that calculate distances.
x_Scaled = x.apply(zscore) # convert all attributes to Z scale
x_Scaled.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| 8 | 1567.0 | 3.137815e-15 | 1.000319 | -3.682475 | -0.699101 | -0.017090 | 0.731294 | 2.621555 |
| 9 | 1567.0 | 9.068829e-18 | 1.000319 | -3.480332 | -0.659460 | -0.030392 | 0.611919 | 5.015393 |
| 10 | 1567.0 | 0.000000e+00 | 1.000319 | -3.771119 | -0.618275 | 0.027359 | 0.619189 | 5.687413 |
| 11 | 1567.0 | -3.827046e-15 | 1.000319 | -24.835180 | -0.502630 | 0.116335 | 0.558453 | 1.643651 |
| 12 | 1567.0 | 1.242430e-14 | 1.000319 | -5.489229 | -0.561085 | -0.128976 | 0.629945 | 22.152683 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 583 | 1567.0 | -5.668018e-18 | 1.000319 | -0.542709 | -0.216539 | -0.088401 | 0.068860 | 26.867231 |
| 586 | 1567.0 | 1.360324e-16 | 1.000319 | -3.105874 | -0.648441 | -0.077604 | 0.497282 | 6.586209 |
| 587 | 1567.0 | 9.068829e-18 | 1.000319 | -1.508184 | -0.667458 | -0.190289 | 0.434576 | 7.205831 |
| 589 | 1567.0 | 1.768422e-16 | 1.000319 | -1.062218 | -0.589367 | -0.294644 | 0.160709 | 6.795495 |
| Pass/Fail | 1567.0 | -4.534414e-18 | 1.000319 | -0.266621 | -0.266621 | -0.266621 | -0.266621 | 3.750641 |
106 rows × 8 columns
# Split x and y into training and test set in 70:30 ratio
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 1)
print("{0:0.2f}% data is in training set".format((len(x_train) / len(df_signal_data.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_test) / len(df_signal_data.index)) * 100))
69.94% data is in training set 30.06% data is in test set
KNN = KNeighborsClassifier(n_neighbors= 5, metric = 'euclidean')
KNN.fit(x_train, y_train)
KNeighborsClassifier(metric='euclidean')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier(metric='euclidean')
# For every test data point, predict it's label based on 5 nearest neighbours in this model. The majority class will
# be assigned to the test data point
predicted_labels = KNN.predict(x_test)
print('Accuracy on Training data: ', KNN.score(x_train, y_train))
print('Accuracy on Test data: ', KNN.score(x_test, y_test))
Accuracy on Training data: 0.9361313868613139 Accuracy on Test data: 0.9320594479830149
confusion_matrix(y_test, predicted_labels)
array([[438, 1],
[ 31, 1]], dtype=int64)
# calculate accuracy measures and confusion matrix
cm = confusion_matrix(y_test, predicted_labels)
df_cm = pd.DataFrame(cm, index = [i for i in ["Pass", "Fail"]], columns = [i for i in ["Pass", "Fail"]])
plt.figure(figsize = (7, 5))
sns.heatmap(df_cm, annot = True, fmt = 'g')
plt.show()
classification_report(y_test, predicted_labels)
' precision recall f1-score support\n\n -1 0.93 1.00 0.96 439\n 1 0.50 0.03 0.06 32\n\n accuracy 0.93 471\n macro avg 0.72 0.51 0.51 471\nweighted avg 0.90 0.93 0.90 471\n'
B. Use cross validation techniques. [3 Marks]
Hint: Use all CV techniques that you have learnt in the course.
x.head()
| 8 | 9 | 10 | 11 | 12 | 14 | 15 | 16 | 18 | 20 | ... | 559 | 570 | 571 | 572 | 582 | 583 | 586 | 587 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.5005 | 0.0162 | -0.0034 | 0.9455 | 202.4396 | 7.9558 | 414.8710 | 10.0433 | 192.3963 | 1.4026 | ... | 0.4385 | 533.8500 | 2.1113 | 8.95 | 0.5005 | 0.0118 | 0.021458 | 0.016475 | 99.670066 | -1 |
| 1 | 1.4966 | -0.0005 | -0.0148 | 0.9627 | 200.5470 | 10.1548 | 414.7347 | 9.2599 | 191.2872 | 1.3825 | ... | 0.1745 | 535.0164 | 2.4335 | 5.92 | 0.5019 | 0.0223 | 0.009600 | 0.020100 | 208.204500 | -1 |
| 2 | 1.4436 | 0.0041 | 0.0013 | 0.9615 | 202.0179 | 9.5157 | 416.7075 | 9.3144 | 192.7035 | 1.4123 | ... | 0.3718 | 535.0245 | 2.0293 | 11.21 | 0.4958 | 0.0157 | 0.058400 | 0.048400 | 82.860200 | 1 |
| 3 | 1.4882 | -0.0124 | -0.0033 | 0.9629 | 201.8482 | 9.6052 | 422.2894 | 9.6924 | 192.1557 | 1.4011 | ... | 0.7288 | 530.5682 | 2.0253 | 9.33 | 0.4990 | 0.0103 | 0.020200 | 0.014900 | 73.843200 | -1 |
| 4 | 1.5031 | -0.0031 | -0.0072 | 0.9569 | 201.9424 | 10.5661 | 420.5925 | 10.3387 | 191.6037 | 1.3888 | ... | 0.2156 | 532.0155 | 2.0275 | 8.83 | 0.4800 | 0.4766 | 0.020200 | 0.014900 | 73.843200 | -1 |
5 rows × 106 columns
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 1)
# Implementing K-fold cross-validation
num_folds = 50
seed = 7
kfold = KFold(n_splits = num_folds, random_state = seed, shuffle = True)
model = LogisticRegression()
results = cross_val_score(model, x, y, cv = kfold)
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))
Accuracy: 93.216% (5.051%)
print("Cross Validation score: {}".format(results))
Cross Validation score: [1. 0.9375 0.9375 0.96875 0.96875 0.90625 0.9375 0.96875 0.96875 1. 0.9375 0.875 1. 0.9375 0.875 1. 0.9375 0.93548387 0.87096774 0.90322581 0.87096774 0.93548387 0.93548387 0.93548387 0.93548387 1. 1. 0.90322581 0.93548387 0.93548387 0.87096774 0.87096774 0.93548387 0.87096774 0.90322581 0.77419355 0.87096774 0.96774194 0.90322581 1. 0.96774194 0.90322581 0.96774194 0.93548387 0.83870968 0.90322581 0.87096774 1. 1. 1. ]
print("Average Cross Validation score: {}".format(results.mean()))
Average Cross Validation score: 0.9321572580645161
# Implementing Stratified k-fold cross-validation
num_folds = 3
seed = 7
stratifiedKFold = StratifiedKFold(n_splits = num_folds, random_state = seed, shuffle = True)
model = LogisticRegression()
results = cross_val_score(model, x, y, cv = stratifiedKFold)
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))
Accuracy: 93.172% (0.087%)
print("Cross Validation score: {}".format(results))
Cross Validation score: [0.93116635 0.93295019 0.93103448]
print("Average Cross Validation score: {}".format(results.mean()))
Average Cross Validation score: 0.9317170074406178
# Implementing Leave-p-out cross-validation
num_folds = 5
leavePOut = LeavePOut(p = 1)
leavePOut.get_n_splits(x)
model = RandomForestClassifier(n_estimators = num_folds, max_depth = 3, n_jobs = -1)
results = cross_val_score(model, x, y, cv = leavePOut)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))
Accuracy: 94.065% (23.628%)
print("Cross Validation score: {}".format(results))
Cross Validation score: [1. 1. 1. ... 1. 1. 1.]
print("Average Cross Validation score: {}".format(results.mean()))
Average Cross Validation score: 0.9406509253350351
# Implementing Leave-one-out cross-validation
num_folds = 7
leaveOneOut = LeaveOneOut()
leavePOut.get_n_splits(x)
model = RandomForestClassifier(n_estimators = num_folds, max_depth = 4, n_jobs = -1)
results = cross_val_score(model, x, y, cv = leaveOneOut)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))
Accuracy: 94.193% (23.388%)
print("Cross Validation score: {}".format(results))
Cross Validation score: [1. 1. 0. ... 1. 1. 1.]
print("Average Cross Validation score: {}".format(results.mean()))
Average Cross Validation score: 0.9419272495213784
# Implementing ShuffleSplit cross-validation
num_folds = 6
shuffleSplit = ShuffleSplit(n_splits = num_folds, test_size = 0.3, train_size = 0.5)
model = LogisticRegression()
results = cross_val_score(model, x, y, cv = shuffleSplit)
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
n_iter_i = _check_optimize_result(
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))
Accuracy: 93.489% (1.121%)
print("Cross Validation score: {}".format(results))
Cross Validation score: [0.92356688 0.92356688 0.95116773 0.94904459 0.92993631 0.93205945]
print("Average Cross Validation score: {}".format(results.mean()))
Average Cross Validation score: 0.9348903043170559
C. Apply hyper-parameter tuning techniques to get the best accuracy. [3 Marks]
Suggestion: Use all possible hyper parameter combinations to extract the best accuracies.
x.head()
| 8 | 9 | 10 | 11 | 12 | 14 | 15 | 16 | 18 | 20 | ... | 559 | 570 | 571 | 572 | 582 | 583 | 586 | 587 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.5005 | 0.0162 | -0.0034 | 0.9455 | 202.4396 | 7.9558 | 414.8710 | 10.0433 | 192.3963 | 1.4026 | ... | 0.4385 | 533.8500 | 2.1113 | 8.95 | 0.5005 | 0.0118 | 0.021458 | 0.016475 | 99.670066 | -1 |
| 1 | 1.4966 | -0.0005 | -0.0148 | 0.9627 | 200.5470 | 10.1548 | 414.7347 | 9.2599 | 191.2872 | 1.3825 | ... | 0.1745 | 535.0164 | 2.4335 | 5.92 | 0.5019 | 0.0223 | 0.009600 | 0.020100 | 208.204500 | -1 |
| 2 | 1.4436 | 0.0041 | 0.0013 | 0.9615 | 202.0179 | 9.5157 | 416.7075 | 9.3144 | 192.7035 | 1.4123 | ... | 0.3718 | 535.0245 | 2.0293 | 11.21 | 0.4958 | 0.0157 | 0.058400 | 0.048400 | 82.860200 | 1 |
| 3 | 1.4882 | -0.0124 | -0.0033 | 0.9629 | 201.8482 | 9.6052 | 422.2894 | 9.6924 | 192.1557 | 1.4011 | ... | 0.7288 | 530.5682 | 2.0253 | 9.33 | 0.4990 | 0.0103 | 0.020200 | 0.014900 | 73.843200 | -1 |
| 4 | 1.5031 | -0.0031 | -0.0072 | 0.9569 | 201.9424 | 10.5661 | 420.5925 | 10.3387 | 191.6037 | 1.3888 | ... | 0.2156 | 532.0155 | 2.0275 | 8.83 | 0.4800 | 0.4766 | 0.020200 | 0.014900 | 73.843200 | -1 |
5 rows × 106 columns
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y, random_state = 7)
print("{0:0.2f}% data is in training set".format((len(x_train) / len(df_signal_data.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_test) / len(df_signal_data.index)) * 100))
74.98% data is in training set 25.02% data is in test set
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train, y_train)
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KNeighborsClassifier()
param_grid = {'n_neighbors': list(range(1, 9)), 'algorithm': ('auto', 'ball_tree', 'kd_tree' , 'brute')}
gs = GridSearchCV(knn_clf, param_grid, cv = 10)
gs.fit(x_train, y_train)
GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
param_grid={'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
param_grid={'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8]})KNeighborsClassifier()
KNeighborsClassifier()
gs.best_params_
{'algorithm': 'auto', 'n_neighbors': 6}
gs.cv_results_['params']
[{'algorithm': 'auto', 'n_neighbors': 1},
{'algorithm': 'auto', 'n_neighbors': 2},
{'algorithm': 'auto', 'n_neighbors': 3},
{'algorithm': 'auto', 'n_neighbors': 4},
{'algorithm': 'auto', 'n_neighbors': 5},
{'algorithm': 'auto', 'n_neighbors': 6},
{'algorithm': 'auto', 'n_neighbors': 7},
{'algorithm': 'auto', 'n_neighbors': 8},
{'algorithm': 'ball_tree', 'n_neighbors': 1},
{'algorithm': 'ball_tree', 'n_neighbors': 2},
{'algorithm': 'ball_tree', 'n_neighbors': 3},
{'algorithm': 'ball_tree', 'n_neighbors': 4},
{'algorithm': 'ball_tree', 'n_neighbors': 5},
{'algorithm': 'ball_tree', 'n_neighbors': 6},
{'algorithm': 'ball_tree', 'n_neighbors': 7},
{'algorithm': 'ball_tree', 'n_neighbors': 8},
{'algorithm': 'kd_tree', 'n_neighbors': 1},
{'algorithm': 'kd_tree', 'n_neighbors': 2},
{'algorithm': 'kd_tree', 'n_neighbors': 3},
{'algorithm': 'kd_tree', 'n_neighbors': 4},
{'algorithm': 'kd_tree', 'n_neighbors': 5},
{'algorithm': 'kd_tree', 'n_neighbors': 6},
{'algorithm': 'kd_tree', 'n_neighbors': 7},
{'algorithm': 'kd_tree', 'n_neighbors': 8},
{'algorithm': 'brute', 'n_neighbors': 1},
{'algorithm': 'brute', 'n_neighbors': 2},
{'algorithm': 'brute', 'n_neighbors': 3},
{'algorithm': 'brute', 'n_neighbors': 4},
{'algorithm': 'brute', 'n_neighbors': 5},
{'algorithm': 'brute', 'n_neighbors': 6},
{'algorithm': 'brute', 'n_neighbors': 7},
{'algorithm': 'brute', 'n_neighbors': 8}]
gs.cv_results_['mean_test_score']
array([0.88174707, 0.92937853, 0.92088223, 0.9327756 , 0.9319209 ,
0.93362306, 0.93362306, 0.93362306, 0.88174707, 0.92937853,
0.92088223, 0.9327756 , 0.9319209 , 0.93362306, 0.93362306,
0.93362306, 0.88174707, 0.92937853, 0.92088223, 0.9327756 ,
0.9319209 , 0.93362306, 0.93362306, 0.93362306, 0.88174707,
0.92937853, 0.92088223, 0.9327756 , 0.9319209 , 0.93362306,
0.93362306, 0.93362306])
D. Use any other technique/method which can enhance the model performance. [4 Marks]
Hint: Dimensionality reduction, attribute removal, standardisation/normalisation, target balancing etc.
x.head()
| 8 | 9 | 10 | 11 | 12 | 14 | 15 | 16 | 18 | 20 | ... | 559 | 570 | 571 | 572 | 582 | 583 | 586 | 587 | 589 | Pass/Fail | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.5005 | 0.0162 | -0.0034 | 0.9455 | 202.4396 | 7.9558 | 414.8710 | 10.0433 | 192.3963 | 1.4026 | ... | 0.4385 | 533.8500 | 2.1113 | 8.95 | 0.5005 | 0.0118 | 0.021458 | 0.016475 | 99.670066 | -1 |
| 1 | 1.4966 | -0.0005 | -0.0148 | 0.9627 | 200.5470 | 10.1548 | 414.7347 | 9.2599 | 191.2872 | 1.3825 | ... | 0.1745 | 535.0164 | 2.4335 | 5.92 | 0.5019 | 0.0223 | 0.009600 | 0.020100 | 208.204500 | -1 |
| 2 | 1.4436 | 0.0041 | 0.0013 | 0.9615 | 202.0179 | 9.5157 | 416.7075 | 9.3144 | 192.7035 | 1.4123 | ... | 0.3718 | 535.0245 | 2.0293 | 11.21 | 0.4958 | 0.0157 | 0.058400 | 0.048400 | 82.860200 | 1 |
| 3 | 1.4882 | -0.0124 | -0.0033 | 0.9629 | 201.8482 | 9.6052 | 422.2894 | 9.6924 | 192.1557 | 1.4011 | ... | 0.7288 | 530.5682 | 2.0253 | 9.33 | 0.4990 | 0.0103 | 0.020200 | 0.014900 | 73.843200 | -1 |
| 4 | 1.5031 | -0.0031 | -0.0072 | 0.9569 | 201.9424 | 10.5661 | 420.5925 | 10.3387 | 191.6037 | 1.3888 | ... | 0.2156 | 532.0155 | 2.0275 | 8.83 | 0.4800 | 0.4766 | 0.020200 | 0.014900 | 73.843200 | -1 |
5 rows × 106 columns
# Split x and y into training and test set in 70:30 ratio
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 1)
print("{0:0.2f}% data is in training set".format((len(x_train) / len(df_signal_data.index)) * 100))
print("{0:0.2f}% data is in test set".format((len(x_test) / len(df_signal_data.index)) * 100))
69.94% data is in training set 30.06% data is in test set
pipe_lr = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components = 2)), ('clf', LogisticRegression(random_state = 1))])
pipe_lr.fit(x_train, y_train)
print('Test Accuracy: %.3f' % pipe_lr.score(x_test, y_test))
Test Accuracy: 0.932
pipe_svc = Pipeline([('scl', StandardScaler()), ('pca', PCA()), ('svc', SVC())])
param_grid = {'pca__n_components': [14, 15], 'svc__C': [0.001, 0.01, 0.1, 1, 10, 100], 'svc__gamma': [0.001, 0.01, 0.1, 1, 10], 'svc__kernel': ['rbf','poly']}
grid = GridSearchCV(pipe_svc , param_grid = param_grid, cv = 5)
grid.fit(x_train, y_train)
print("Best cross-validation accuracy: {:.2f}". format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
print("Test set accuracy: {:.2f}". format(grid.score(x_test, y_test)))
Best cross-validation accuracy: 0.93
Best parameters: {'pca__n_components': 14, 'svc__C': 0.001, 'svc__gamma': 0.1, 'svc__kernel': 'poly'}
Test set accuracy: 0.93
grid.predict(x_test)
array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=int64)
# build a classifier
clf = RandomForestClassifier(n_estimators = 50)
# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
"max_features": sp_randint(1, 11),
"min_samples_split": sp_randint(2, 11),
"min_samples_leaf": sp_randint(1, 11),
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
# run randomized search
samples = 10 # number of random samples
randomCV = RandomizedSearchCV(clf, param_distributions = param_dist, n_iter = samples) # default cv = 3
randomCV.fit(x, y)
RandomizedSearchCV(estimator=RandomForestClassifier(n_estimators=50),
param_distributions={'bootstrap': [True, False],
'criterion': ['gini', 'entropy'],
'max_depth': [3, None],
'max_features': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001E2F8C660B0>,
'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001E2F8B33BE0>,
'min_samples_split': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001E2F8B309A0>})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(estimator=RandomForestClassifier(n_estimators=50),
param_distributions={'bootstrap': [True, False],
'criterion': ['gini', 'entropy'],
'max_depth': [3, None],
'max_features': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001E2F8C660B0>,
'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001E2F8B33BE0>,
'min_samples_split': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x000001E2F8B309A0>})RandomForestClassifier(n_estimators=50)
RandomForestClassifier(n_estimators=50)
randomCV.best_params_
{'bootstrap': True,
'criterion': 'gini',
'max_depth': 3,
'max_features': 9,
'min_samples_leaf': 3,
'min_samples_split': 5}
randomCV.cv_results_['mean_test_score']
array([0.93554262, 0.93554262, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93617956])
randomCV.best_estimator_
RandomForestClassifier(max_depth=3, max_features=9, min_samples_leaf=3,
min_samples_split=5, n_estimators=50)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(max_depth=3, max_features=9, min_samples_leaf=3,
min_samples_split=5, n_estimators=50)
# use a full grid over all parameters
param_grid = {"max_depth": [3, None],
"max_features": [1, 3, 10],
"min_samples_split": [2, 3, 10],
"min_samples_leaf": [1, 3, 10],
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
# run grid search
grid_search = GridSearchCV(clf, param_grid = param_grid)
grid_search.fit(x, y)
GridSearchCV(estimator=RandomForestClassifier(n_estimators=50),
param_grid={'bootstrap': [True, False],
'criterion': ['gini', 'entropy'],
'max_depth': [3, None], 'max_features': [1, 3, 10],
'min_samples_leaf': [1, 3, 10],
'min_samples_split': [2, 3, 10]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GridSearchCV(estimator=RandomForestClassifier(n_estimators=50),
param_grid={'bootstrap': [True, False],
'criterion': ['gini', 'entropy'],
'max_depth': [3, None], 'max_features': [1, 3, 10],
'min_samples_leaf': [1, 3, 10],
'min_samples_split': [2, 3, 10]})RandomForestClassifier(n_estimators=50)
RandomForestClassifier(n_estimators=50)
grid_search.best_params_
{'bootstrap': True,
'criterion': 'gini',
'max_depth': None,
'max_features': 10,
'min_samples_leaf': 1,
'min_samples_split': 2}
grid_search.cv_results_['mean_test_score']
array([0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93936631, 0.93554262,
0.93363179, 0.93554262, 0.93872734, 0.93363179, 0.93363179,
0.93617956, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93617956, 0.93872734, 0.93809039, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
1. , 1. , 1. , 1. , 0.99936102,
0.99872204, 0.94828962, 0.95087605, 0.94767099, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93554262, 0.93490568,
0.93554262, 0.93363179, 0.93490568, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93490568, 0.93426874, 0.93426874, 0.93490568, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.97766224,
0.9789463 , 0.97892595, 0.96550945, 0.95148247, 0.9553143 ,
0.94637065, 0.93490771, 0.94129546, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93872734, 0.93872734, 0.93617956, 0.93936428,
0.93745345, 0.93745345, 0.93363179, 0.93363179, 0.93363179,
0.93426874, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.94191205,
0.94191205, 0.93872937, 0.93299485, 0.93490568, 0.93426874,
0.93363179, 0.93363179, 0.93363179, 1. , 0.99936306,
0.98917197, 0.99936102, 0.99744409, 0.99872611, 0.95913596,
0.96871655, 0.97062534, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93681651, 0.93745345, 0.93363179, 0.93426874, 0.93617956,
0.93745345, 0.93363179, 0.93426874, 0.93426874, 0.93426874,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93363179,
0.93363179, 0.93363179, 0.93363179, 0.93363179, 0.93681651,
0.93426874, 0.93363179, 0.93426874, 0.93617956, 0.93363179,
0.93363179, 0.93363179, 0.97700494, 0.98530555, 0.9859547 ,
0.96996805, 0.96550335, 0.96550131, 0.95276449, 0.95977086,
0.95275025])
grid_search.best_estimator_
RandomForestClassifier(max_features=10, n_estimators=50)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_features=10, n_estimators=50)
# fit scaler on training data
norm = MinMaxScaler().fit(x_train)
# transform training data
x_train_norm = norm.transform(x_train)
# transform testing dataabs
x_test_norm = norm.transform(x_test)
# copy of datasets
x_train_stand = x_train.copy()
x_test_stand = x_test.copy()
# apply standardization on numerical features
for i in df_signal_data.select_dtypes([np.number]).columns:
# fit on training data column
scale = StandardScaler().fit(x_train_stand[[i]])
# transform the training data column
x_train_stand[i] = scale.transform(x_train_stand[[i]])
# transform the testing data column
x_test_stand[i] = scale.transform(x_test_stand[[i]])
# Comparing Unscaled, Normalized, and Standardized Data
knn = KNeighborsRegressor(n_neighbors = 7)
rmse = []
# raw, normalized and standardized training and testing data
trainx = [x_train, x_train_norm, x_train_stand]
testx = [x_test, x_test_norm, x_test_stand]
# model fitting and measuring RMSE
for i in range(len(trainx)):
knn.fit(trainx[i], y_train)
pred = knn.predict(testx[i])
rmse.append(np.sqrt(mean_squared_error(y_test, pred)))
# visualizing the result
df_knn = pd.DataFrame({'RMSE': rmse}, index = ['Original','Normalized','Standardized'])
df_knn
| RMSE | |
|---|---|
| Original | 0.523630 |
| Normalized | 0.146599 |
| Standardized | 0.366971 |
# Comparing Unscaled, Normalized, and Standardized Data
svr = SVR(kernel = 'rbf', C = 5)
rmse = []
# raw, normalized and standardized training and testing data
trainx = [x_train, x_train_norm, x_train_stand]
testx = [x_test, x_test_norm, x_test_stand]
# model fitting and measuring RMSE
for i in range(len(trainx)):
svr.fit(trainx[i], y_train)
pred = svr.predict(testx[i])
rmse.append(np.sqrt(mean_squared_error(y_test, pred)))
# visualizing the result
df_svr = pd.DataFrame({'RMSE': rmse}, index = ['Original','Normalized','Standardized'])
df_svr
| RMSE | |
|---|---|
| Original | 0.504901 |
| Normalized | 0.082879 |
| Standardized | 0.198274 |
# Comparing Unscaled, Normalized, and Standardized Data
dt = DecisionTreeRegressor(max_depth = 10, random_state = 27)
rmse = []
# raw, normalized and standardized training and testing data
trainx = [x_train, x_train_norm, x_train_stand]
testx = [x_test, x_test_norm, x_test_stand]
# model fitting and measuring RMSE
for i in range(len(trainx)):
dt.fit(trainx[i], y_train)
pred = dt.predict(testx[i])
rmse.append(np.sqrt(mean_squared_error(y_test, pred)))
# visualizing the result
df_dt = pd.DataFrame({'RMSE': rmse}, index = ['Original','Normalized','Standardized'])
df_dt
| RMSE | |
|---|---|
| Original | 0.0 |
| Normalized | 0.0 |
| Standardized | 0.0 |
# Split x and y into training and test set in 70:30 ratio
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 7)
# it takes a list of tuples as parameter. The last entry is the call to the modelling algorithm
pipeline = Pipeline([
('scaler', StandardScaler()),
('clf', LogisticRegression())
])
# use the pipeline object as you would a regular classifier
pipeline.fit(x_train, y_train)
Pipeline(steps=[('scaler', StandardScaler()), ('clf', LogisticRegression())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('scaler', StandardScaler()), ('clf', LogisticRegression())])StandardScaler()
LogisticRegression()
y_predict = pipeline.predict(x_test)
model_score = pipeline.score(x_test, y_test)
print(model_score)
1.0
print(metrics.confusion_matrix(y_test, y_predict))
[[448 0] [ 0 23]]
E. Display and explain the classification report in detail. [3 Marks]
# Gaussian Naive Bayes on Normal Dataset
nb = GaussianNB()
nb.fit(x_train, y_train)
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GaussianNB()
modelnb_score = nb.score(x_train, y_train)
print('Accuracy Score of Training Data: ', modelnb_score)
Accuracy Score of Training Data: 0.9936131386861314
y_predictnb= nb.predict(x_test)
modelnb_score = accuracy_score(y_test, y_predictnb)
print('Accuracy Score of Test Data:', modelnb_score)
Accuracy Score of Test Data: 0.9893842887473461
#printing classification report
print("Classification Report")
print(metrics.classification_report(y_test, y_predictnb, labels=[1, 0]))
Classification Report
precision recall f1-score support
1 0.82 1.00 0.90 23
0 0.00 0.00 0.00 0
micro avg 0.82 1.00 0.90 23
macro avg 0.41 0.50 0.45 23
weighted avg 0.82 1.00 0.90 23
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
# visualizing confusion matrix
cm= confusion_matrix(y_test, y_predictnb)
plt.figure(figsize = (6, 4))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Predicted Classes', fontsize = 15)
plt.ylabel('Actual Classes', fontsize = 15)
plt.title('Confusion Matrix for GNB', fontsize = 15);
precision_nb, recall_nb, f1_score_nb, support = precision_recall_fscore_support(y_test, y_predictnb, average = 'macro')
print('Precision Score :', '%0.2f' % precision_nb)
print('Recall Score :', '%0.2f' % recall_nb)
print('F1-Score:', '%0.2f' % f1_score_nb)
nb_acc= accuracy_score(y_test, y_predictnb)
print('Accuracy Score :','%0.2f' % nb_acc)
print('Thresholdnb :','%0.2f' % 0.016)
Thresholdnb=0.016
Precision Score : 0.91 Recall Score : 0.99 F1-Score: 0.95 Accuracy Score : 0.99 Thresholdnb : 0.02
# Gaussian Naive Bayes on Under sampled Data
nbu = GaussianNB()
nbu.fit(x_under, y_under)
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GaussianNB()
modelnbu_score = nbu.score(x_under,y_under)
print('Accuracy Score of Training Data: ', modelnbu_score)
Accuracy Score of Training Data: 1.0
y_predictnbu= nbu.predict(x_test)
modelnbu_score = accuracy_score(y_test, y_predictnbu)
print('Accuracy Score of Test Data:', modelnbu_score)
Accuracy Score of Test Data: 0.9766454352441614
#printing classification report
print("Classification Report")
print(metrics.classification_report(y_test, y_predictnbu, labels=[1, 0]))
Classification Report
precision recall f1-score support
1 0.68 1.00 0.81 23
0 0.00 0.00 0.00 0
micro avg 0.68 1.00 0.81 23
macro avg 0.34 0.50 0.40 23
weighted avg 0.68 1.00 0.81 23
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
# visualizing confusion matrix
cm= confusion_matrix(y_test, y_predictnbu)
plt.figure(figsize = (6, 4))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.ylabel('Actual Classes', fontsize = 15)
plt.xlabel('Predicted Classes', fontsize = 15)
plt.title('Confusion Matrix for GNB Under sampled', fontsize = 15);
precision_nbu, recall_nbu, f1_score_nbu, support = precision_recall_fscore_support(y_test, y_predictnbu, average = 'macro')
print('Precision Score :', '%0.2f' % precision_nbu)
print('Recall Score :', '%0.2f' % recall_nbu)
print('F1-Score:', '%0.2f' % f1_score_nbu)
nbu_acc= accuracy_score(y_test, y_predictnbu)
print('Accuracy Score :','%0.2f' % nbu_acc)
print('Thresholdnbu:','%0.2f' % 0.4753)
Thresholdnbu=0.4753
Precision Score : 0.84 Recall Score : 0.99 F1-Score: 0.90 Accuracy Score : 0.98 Thresholdnbu: 0.48
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 500, num = 10)]
random_grid = {'n_estimators': n_estimators, 'criterion':['gini','entropy']}
# RandomForest on Random over sampled Dataset
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, cv = 5, verbose=2, random_state=90, n_jobs = -1)
rf_random.fit(x_over, y_over)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
param_distributions={'criterion': ['gini', 'entropy'],
'n_estimators': [50, 100, 150, 200, 250,
300, 350, 400, 450,
500]},
random_state=90, verbose=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
param_distributions={'criterion': ['gini', 'entropy'],
'n_estimators': [50, 100, 150, 200, 250,
300, 350, 400, 450,
500]},
random_state=90, verbose=2)RandomForestClassifier()
RandomForestClassifier()
modelrfg1_score=rf_random.score(x_over,y_over)
print('Accuracy Score of Training Data: ', modelrfg1_score)
Accuracy Score of Training Data: 1.0
y_predictrfg1= rf_random.predict(x_test)
modelrfg1_score = accuracy_score(y_test, y_predictrfg1)
print('Accuracy Score of Test Data:', modelrfg1_score)
Accuracy Score of Test Data: 0.9957537154989384
#printing classification report
print("Classification Report")
print(metrics.classification_report(y_test, y_predictrfg1, labels=[1, 0]))
Classification Report
precision recall f1-score support
1 1.00 0.91 0.95 23
0 0.00 0.00 0.00 0
micro avg 1.00 0.91 0.95 23
macro avg 0.50 0.46 0.48 23
weighted avg 1.00 0.91 0.95 23
C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result)) C:\Users\Bhavya Govindrao\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
# visualizing confusion matrix
cm= confusion_matrix(y_test, y_predictrfg1)
plt.figure(figsize = (6, 4))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.ylabel('Actual Classes', fontsize = 15)
plt.xlabel('Predicted Classes', fontsize = 15)
plt.title('Confusion Matrix for RF Over Sampled', fontsize = 15);
precision_rfo, recall_rfo, f1_score_rfo, support = precision_recall_fscore_support(y_test, y_predictrfg1, average = 'macro')
print('Precision Score :', '%0.2f' % precision_rfo)
print('Recall Score :', '%0.2f' % recall_rfo)
print('F1-Score:', '%0.2f' % f1_score_rfo)
rfo_acc= accuracy_score(y_test, y_predictrfg1)
print('Accuracy Score :','%0.2f' % rfo_acc)
print('Thresholdrf :','%0.2f' % 0.1688)
Thresholdrf=0.1688
Precision Score : 1.00 Recall Score : 0.96 F1-Score: 0.98 Accuracy Score : 1.00 Thresholdrf : 0.17
F. Apply the above steps for all possible models that you have learnt so far. [5 Marks]
lrcl = LogisticRegression()
nbcl = GaussianNB()
dtcl = DecisionTreeClassifier()
lr = LinearRegression()
knncl = KNeighborsClassifier()
svcl= SVC()
svr = SVR()
rfcl = RandomForestClassifier()
bgcl = BaggingClassifier()
nncl = MLPClassifier()
for clf, label in zip([lrcl , nbcl, dtcl, lr, knncl, svcl, svr, rfcl, bgcl, nncl],
['lrcl' , 'nbcl', 'dtcl', 'lr', 'knncl', 'svcl', 'svr', 'rfcl', 'bgcl', 'nncl']):
print("model name: " , label)
print("\n model_hyperparameters \n" , clf.get_params())
model name: lrcl
model_hyperparameters
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
model name: nbcl
model_hyperparameters
{'priors': None, 'var_smoothing': 1e-09}
model name: dtcl
model_hyperparameters
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}
model name: lr
model_hyperparameters
{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}
model name: knncl
model_hyperparameters
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
model name: svcl
model_hyperparameters
{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
model name: svr
model_hyperparameters
{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}
model name: rfcl
model_hyperparameters
{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
model name: bgcl
model_hyperparameters
{'base_estimator': 'deprecated', 'bootstrap': True, 'bootstrap_features': False, 'estimator': None, 'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
model name: nncl
model_hyperparameters
{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 200, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}
A. Display and compare all the models designed with their train and test accuracies. [1 Marks]
B. Select the final best trained model along with your detailed comments for selecting this model. [1 Marks]
modellists = []
modellists.append(['Gaussian NB Normal Data', nb_acc * 100, recall_nb * 100, precision_nb * 100,f1_score_nb*100,Thresholdnb])
modellists.append(['Gausian NB under samples data', nbu_acc* 100, recall_nbu * 100, precision_nbu* 100,f1_score_nbu*100,Thresholdnbu])
modellists.append(['Random Forest Over sampled Data', rfo_acc * 100, recall_rfo * 100, precision_rfo * 100,f1_score_rfo*100,Thresholdrf])
model_df = pd.DataFrame(modellists, columns = ['Model', 'Accuracy Scores on Test', 'Recall Score', 'Precision Score','F1 Score','Threshold'])
model_df
| Model | Accuracy Scores on Test | Recall Score | Precision Score | F1 Score | Threshold | |
|---|---|---|---|---|---|---|
| 0 | Gaussian NB Normal Data | 98.938429 | 99.441964 | 91.071429 | 94.817456 | 0.0160 |
| 1 | Gausian NB under samples data | 97.664544 | 98.772321 | 83.823529 | 89.729408 | 0.4753 |
| 2 | Random Forest Over sampled Data | 99.575372 | 95.652174 | 99.777778 | 97.615914 | 0.1688 |
C. Pickle the selected model for future use. [2 Marks]
import pickle
regressor = LinearRegression()
#Fitting model with training data
regressor.fit(x, y)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
pickle.dump(regressor, open('model.pkl','wb'))
# Loading model to compare the results
model = pickle.load(open('model.pkl','rb'))
model
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LinearRegression()
D. Write your conclusion on the results. [1 Marks]